This notebook was used simply to reformat the accident data for pycharts. It transforms the data so that it aggregates the total number of accidents per borough for each date.


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load accident data.
accident_data = pd.read_csv('../../NYPD_Motor_Vehicle_Collisions.csv')

In [12]:
# Convert date column to datetime format so that we can sort properly.
accident_data['PARSED_DATE'] = pd.to_datetime(accident_data['DATE'])
accident_data = accident_data.sort_values(by=['PARSED_DATE'])
accident_data['YEAR'] = accident_data['PARSED_DATE'].dt.year
accident_data['MONTH'] = accident_data['PARSED_DATE'].dt.month

In [53]:
# Aggregate by borough for each date and then unstack to reflatten it back out so each
# borough is a column.
timeseries_data = accident_data.groupby(['PARSED_DATE', 'BOROUGH']).size().unstack(level=-1)

In [54]:
timeseries_data


Out[54]:
BOROUGH BRONX BROOKLYN MANHATTAN QUEENS STATEN ISLAND
PARSED_DATE
2012-07-01 39 135 119 101 26
2012-07-02 71 140 113 94 25
2012-07-03 73 167 150 136 22
2012-07-04 51 94 65 93 22
2012-07-05 60 153 107 123 24
2012-07-06 59 131 147 131 34
2012-07-07 49 111 98 93 27
2012-07-08 50 112 70 80 22
2012-07-09 58 117 104 113 16
2012-07-10 42 128 123 93 24
2012-07-11 51 140 110 102 24
2012-07-12 53 147 131 88 26
2012-07-13 59 143 128 103 24
2012-07-14 53 123 110 91 30
2012-07-15 46 114 108 82 25
2012-07-16 51 138 100 97 21
2012-07-17 53 145 137 124 25
2012-07-18 63 114 127 106 28
2012-07-19 36 104 118 106 24
2012-07-20 48 122 142 132 28
2012-07-21 54 95 120 105 13
2012-07-22 50 131 98 88 20
2012-07-23 47 130 109 95 13
2012-07-24 58 124 139 98 28
2012-07-25 70 136 131 110 24
2012-07-26 51 128 159 123 32
2012-07-27 52 140 145 123 35
2012-07-28 53 110 116 105 29
2012-07-29 42 62 84 83 26
2012-07-30 48 131 124 97 19
... ... ... ... ... ...
2017-01-09 65 113 80 96 21
2017-01-10 53 115 78 76 23
2017-01-11 55 129 74 106 26
2017-01-12 64 114 69 126 18
2017-01-13 55 134 70 107 16
2017-01-14 50 92 60 91 19
2017-01-15 55 90 73 89 11
2017-01-16 48 123 51 82 13
2017-01-17 60 123 88 105 17
2017-01-18 53 108 78 95 14
2017-01-19 51 126 92 94 13
2017-01-20 52 125 91 102 18
2017-01-21 55 86 52 79 8
2017-01-22 46 75 54 71 15
2017-01-23 61 142 120 93 12
2017-01-24 69 111 80 105 17
2017-01-25 45 98 84 95 23
2017-01-26 56 110 87 79 14
2017-01-27 49 107 85 104 12
2017-01-28 55 79 74 92 17
2017-01-29 32 88 63 68 10
2017-01-30 58 104 59 101 12
2017-01-31 56 117 70 100 19
2017-02-01 60 111 93 88 14
2017-02-02 64 118 71 95 18
2017-02-03 64 108 80 112 15
2017-02-04 48 111 65 83 8
2017-02-05 30 99 50 92 14
2017-02-06 51 115 72 88 7
2017-02-07 44 121 68 95 21

1683 rows × 5 columns


In [55]:
# Save output to file.
timeseries_data.to_csv('../website/data/accident_timeseries_data.csv')

In [13]:
# Aggregate by borough for each year and then unstack to reflatten it back out so each
# borough is a column.
timeseries_yearly_data = accident_data.groupby(['YEAR', 'BOROUGH']).size().unstack(level=-1)

In [14]:
timeseries_yearly_data


Out[14]:
BOROUGH BRONX BROOKLYN MANHATTAN QUEENS STATEN ISLAND
YEAR
2012 9509 23298 20716 19135 4902
2013 19766 47007 41577 39303 8295
2014 19679 47741 41247 41001 6612
2015 21240 50819 42497 42468 6339
2016 21653 47625 36688 41440 6717
2017 2012 4202 2772 3604 618

In [15]:
# Save output to file.
timeseries_yearly_data.to_csv('../website/data/accident_timeseries_yearly_data.csv')

In [16]:
# Aggregate by borough for each year/month and then unstack to reflatten it back out so each
# borough is a column.
timeseries_monthly_data = accident_data.groupby(['YEAR', 'MONTH', 'BOROUGH']).size().unstack(level=-1)

In [17]:
timeseries_monthly_data


Out[17]:
BOROUGH BRONX BROOKLYN MANHATTAN QUEENS STATEN ISLAND
YEAR MONTH
2012 7 1641 3884 3653 3204 768
8 1665 3915 3594 3149 849
9 1548 3786 3506 3150 808
10 1604 3881 3497 3174 831
11 1444 3752 3064 3186 798
12 1607 4080 3402 3272 848
2013 1 1486 3656 3017 3047 773
2 1541 3399 2856 2733 717
3 1746 3686 3285 3285 807
4 1602 3871 3358 3115 765
5 1808 4284 3841 3497 904
6 1748 4318 3739 3344 894
7 1749 3984 3597 3435 513
8 1605 3900 3541 3078 529
9 1666 3806 3565 3270 551
10 1580 4022 3703 3502 609
11 1610 4056 3622 3532 607
12 1625 4025 3453 3465 626
2014 1 1635 3908 3140 3400 627
2 1686 3590 2914 3338 612
3 1561 3719 3134 3177 531
4 1528 3767 3415 3202 501
5 1686 4204 3816 3475 527
6 1747 4247 3716 3626 629
7 1662 4069 3500 3428 526
8 1610 3905 3484 3357 505
9 1712 4163 3553 3513 555
10 1639 4222 3651 3655 606
11 1583 3926 3429 3433 514
12 1630 4021 3495 3397 479
2015 1 1704 3863 2875 3417 552
2 1435 3498 2670 2876 440
3 1823 4362 3396 3631 513
4 1623 3869 3521 3127 456
5 1837 4518 3879 3641 630
6 1831 4322 3728 3680 512
7 1821 4288 3842 3682 522
8 1878 4452 3707 3664 508
9 1751 4385 3639 3707 514
10 1866 4635 4023 3911 561
11 1788 4236 3545 3504 556
12 1883 4391 3672 3628 575
2016 1 1878 4434 3180 3854 614
2 1641 3756 3191 3044 522
3 1833 4399 3477 3700 561
4 1962 4432 3540 3579 521
5 2005 4381 3426 3741 610
6 1743 3807 2949 3380 582
7 1933 3896 3046 3495 568
8 1785 3715 2836 3342 516
9 1773 3696 2742 3320 533
10 1718 3711 2831 3383 581
11 1641 3736 2838 3255 534
12 1741 3662 2632 3347 575
2017 1 1651 3419 2273 2951 521
2 361 783 499 653 97

In [18]:
# Save output to file.
timeseries_monthly_data.to_csv('../website/data/accident_timeseries_monthly_data.csv')